YouTube Video Data Analysis

# Basic libraries 
library(ggplot2)
library(dplyr)
library(tidyverse)
library(corrplot)
library(ggcorrplot)
library(lubridate)
library(jsonlite)
library(stringi)

# Text mining libs
library(SnowballC)
library(tidytext)
library(spacyr)
library(tm)
library(wordcloud2)
library(ggraph)
library(textstem)
library(ggridges)
# Read in the Data from a CSV file.
df <- read.csv('YouTube-videos.csv') #colClasses=c("headline"="character")
# Remove unwanted columns
df$thumbnail_link <- NULL
df$comments_disabled <- NULL
df$video_error_or_removed <- NULL
df$ratings_disabled <- NULL

# Convert Dates
# NOTE: trending_date in this dataset is stored as "yy.dd.mm" (e.g. "17.14.11"),
# hence the unusual format string.
df$trending_date <- as.Date(df$trending_date, "%y.%d.%m")
# as.Date() deliberately truncates publish_time to the date, dropping the
# time-of-day component.
df$publish_time <- as.Date(df$publish_time, "%Y-%m-%d")

# Change Others to Factors
df$category_id <- as.factor(df$category_id)
# Import the YouTube Category Names
cats <- fromJSON("youtubeVideoCatUS.json", flatten = TRUE)
cats <- as.data.frame(cats)
# Create a new column that contains the English name of the category based on the Category ID
# match() coerces the factor to character, so the lookup compares the id text,
# not the factor's integer codes.
df$category_name <- cats$items.snippet.title[match(df$category_id, cats$items.id)]
df$category_name <- as.factor(df$category_name)

Cleaning

Cleaning up the #NAME? placeholder values in video_id

# Count how many rows carry the '#NAME?' placeholder (a spreadsheet-export
# artifact) in video_id. Summing the logical condition over the whole column
# is equivalent to filtering first and counting rows.
df %>%
  summarize(total_records = sum(video_id == '#NAME?'))

Replace each occurrence of #NAME? with a random generated string.

# Replace every '#NAME?' placeholder with its own random 11-character id.
# Fix: the original called stri_rand_strings(1, 11), generating a SINGLE
# random string that was recycled across all matching rows. That collapsed
# every placeholder row into one artificial video_id — visible in the summary
# below, where the top video_id has the same count (525) as the '#NAME?'
# total. Generating one string per matching row keeps the rows distinct.
bad_rows <- df$video_id == '#NAME?'
df$video_id[bad_rows] <- stri_rand_strings(sum(bad_rows), 11)
df$video_id <- as.factor(df$video_id)

EDA

Perform Exploratory Data Analysis to better understand the data.

Summary Statistics

# Inspect the structure of the cleaned data frame: 40,881 rows, 13 columns;
# dates are Date class and id/category columns are factors.
str(df)
## 'data.frame':    40881 obs. of  13 variables:
##  $ video_id     : Factor w/ 24104 levels "__4c1JCHvaQ",..: 14219 585 2731 6677 1593 846 321 1464 11920 2086 ...
##  $ trending_date: Date, format: "2017-11-14" "2017-11-14" ...
##  $ title        : chr  "Eminem - Walk On Water (Audio) ft. Beyoncé" "PLUSH - Bad Unboxing Fan Mail" "Racist Superman | Rudy Mancuso, King Bach & Lele Pons" "I Dare You: GOING BALD!?" ...
##  $ channel_title: chr  "EminemVEVO" "iDubbbzTV" "Rudy Mancuso" "nigahiga" ...
##  $ category_id  : Factor w/ 17 levels "1","2","10","15",..: 3 9 9 10 3 11 9 8 10 8 ...
##  $ publish_time : Date, format: "2017-11-10" "2017-11-13" ...
##  $ tags         : chr  "Eminem|\"Walk\"|\"On\"|\"Water\"|\"Aftermath/Shady/Interscope\"|\"Rap\"" "plush|\"bad unboxing\"|\"unboxing\"|\"fan mail\"|\"idubbbztv\"|\"idubbbztv2\"|\"things\"|\"best\"|\"packages\"|"| __truncated__ "racist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"|\"racist\"|\"superman\"|\"love\"|\"rudy mancuso poo bear"| __truncated__ "ryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"idy\"|\"rhpc\"|\"dares\"|\"no truth\"|\"comments\"|\"com"| __truncated__ ...
##  $ views        : int  17158579 1014651 3191434 2095828 33523622 1309699 2987945 748374 4477587 505161 ...
##  $ likes        : int  787425 127794 146035 132239 1634130 103755 187464 57534 292837 4135 ...
##  $ dislikes     : int  43420 1688 5339 1989 21082 4613 9850 2967 4123 976 ...
##  $ comment_count: int  125882 13030 8181 17518 85067 12143 26629 15959 36391 1484 ...
##  $ description  : chr  "Eminem's new track Walk on Water ft. Beyoncé is available everywhere: http://shady.sr/WOWEminem \\nPlaylist Bes"| __truncated__ "STill got a lot of packages. Probably will last for another year. On a side note, more 2nd channel vids soon. e"| __truncated__ "WATCH MY PREVIOUS VIDEO ▶ \\n\\nSUBSCRIBE ► https://www.youtube.com/channel/UC5jkXpfnBhlDjqh0ir5FsIQ?sub_confir"| __truncated__ "I know it's been a while since we did this show, but we're back with what might be the best episode yet!\\nLeav"| __truncated__ ...
##  $ category_name: Factor w/ 17 levels "Autos & Vehicles",..: 9 2 2 4 9 10 2 12 4 12 ...
# Summary statistics for the id, date, engagement, and category columns.
# Note the extreme right skew: views range from 733 to ~138M, so later plots
# use log10 axes.
df %>%
  select(video_id, trending_date, publish_time, views, likes, dislikes, comment_count, category_name) %>%
  summary()
##         video_id     trending_date         publish_time       
##  3wwEC4Co4j6:  525   Min.   :2017-11-14   Min.   :2008-01-13  
##  6ZfuNTqbHE8:    8   1st Qu.:2018-01-04   1st Qu.:2018-01-02  
##  l_lblj8Cq0o:    8   Median :2018-02-26   Median :2018-02-24  
##  UceaB4D0jpo:    8   Mean   :2018-02-27   Mean   :2018-02-23  
##  VYOjWnS4cMY:    8   3rd Qu.:2018-04-24   3rd Qu.:2018-04-23  
##  7X_WvGAhMlQ:    7   Max.   :2018-06-14   Max.   :2018-06-14  
##  (Other)    :40317                                            
##      views               likes            dislikes       comment_count    
##  Min.   :      733   Min.   :      0   Min.   :      0   Min.   :      0  
##  1st Qu.:   143902   1st Qu.:   2191   1st Qu.:     99   1st Qu.:    417  
##  Median :   371204   Median :   8780   Median :    303   Median :   1301  
##  Mean   :  1147036   Mean   :  39583   Mean   :   2009   Mean   :   5043  
##  3rd Qu.:   963302   3rd Qu.:  28717   3rd Qu.:    950   3rd Qu.:   3713  
##  Max.   :137843120   Max.   :5053338   Max.   :1602383   Max.   :1114800  
##                                                                           
##          category_name  
##  Entertainment  :13451  
##  News & Politics: 4159  
##  People & Blogs : 4105  
##  Comedy         : 3773  
##  Music          : 3731  
##  Sports         : 2787  
##  (Other)        : 8875
  • NOTE: The most frequent video_id shows the same count of 525 as the prior “#NAME?” total, because every “#NAME?” row was replaced with the same single random string.
# Peek at the first and last rows to sanity-check the cleaned data.
head(df)
tail(df)

Views, Likes and Dislikes

  • Analyze the number of views, likes, and dislikes with full data visualization (univariate and multivariate). Then comment on various trends.

Views

# Univariate distribution of views. Views are heavily right-skewed, so a
# log10 x-axis makes the shape readable.
# Fixes: the y-axis label said "Count (Log 10)" although the log10
# transformation is applied to the x-axis, and the subtitle was missing "on".
df %>%
  ggplot(aes(x = views)) +
  geom_histogram(color = "lightblue3", fill = "lightblue", bins = 30) + 
  scale_x_continuous(trans='log10') +
  theme_minimal() +
  theme(
  plot.title = element_text(face = "bold")) +
  labs(
  x = NULL, y = "Count",
  title = "Distribution of Views",
  subtitle = "Using a log10 transformation on the x-axis"
  )

# Mean views per category, with bars ordered by the mean.
# Fixes: arrange() alone does not affect a discrete ggplot axis, so the bars
# are ordered via reorder(); the y-axis label said "Count" although the value
# plotted is the mean of views; the title had "Videos Views"; summarise()
# output is explicitly ungrouped.
df %>%
  group_by(category_name) %>%
  summarise(total_views = mean(views), .groups = "drop") %>%
  ggplot(aes(x = reorder(category_name, total_views), y = total_views)) +
  geom_col(color = "lightblue3", fill = "lightblue") + 
  theme_minimal() +
  theme(
  plot.title = element_text(face = "bold")) +
  labs(
  x = NULL, y = "Mean Views",
  title = "Mean Number of Video Views per Category"
  ) +
  coord_flip()

# Small-multiples view: one histogram of views per category, all sharing a
# log10 x-axis so the skewed counts stay comparable across panels. The
# legend is dropped because the facet strip already names each category.
df %>%
  ggplot(aes(x = views, color = category_name, fill = category_name)) +
  geom_histogram(alpha = 0.6, bins = 30) +
  scale_x_continuous(trans = 'log10') +
  facet_wrap(~category_name) +
  xlab("") +
  ylab("") +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 8)
  )

# Box plots of views by category on a log10 scale: medians and spread for
# every category side by side, colored by category with the legend kept on
# the right.
df %>%
  ggplot(aes(views, color = category_name)) +
  geom_boxplot() +
  scale_x_continuous(trans = 'log10') +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(face = "bold")
  ) +
  labs(
    title = "Video Views Distribution by Category",
    x = "Views (Log10 Transformation)",
    y = NULL
  )

# Mean views per category, bucketed by month: floor_date() maps each
# trending_date onto the first day of its month, and the stacked columns
# show how the per-category means evolve over time.
df %>%
  mutate(month_start = floor_date(trending_date, 'month')) %>%
  group_by(month_start, category_name) %>%
  summarize(mean_views = mean(views), .groups = 'keep') %>%
  ggplot(aes(x = month_start, y = mean_views, fill = category_name)) +
  geom_col(color = "black") +
  scale_x_date(date_breaks = "1 month", expand = c(0, 0), date_labels = "%b-%y") +
  theme_classic() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = "Trending Date", y = "Views",
    title = "Mean Number of Views per Category",
    subtitle = "Grouped by Month for Trending Date"
  )

Likes

# Univariate distribution of likes; like views, heavily right-skewed, so a
# log10 x-axis is used.
# Fixes: the y-axis label said "Count (Log 10)" although the log10
# transformation is on the x-axis, and the subtitle was missing "on".
df %>%
  ggplot(aes(x = likes)) +
  geom_histogram(color = "gray", fill = "lightgray", bins = 30) + 
  scale_x_continuous(trans='log10') +
  theme_minimal() +
  theme(
  plot.title = element_text(face = "bold")) +
  labs(
  x = NULL, y = "Count",
  title = "Distribution of Likes",
  subtitle = "Using a log10 transformation on the x-axis"
  )

# Small-multiples view: one histogram of likes per category on a shared
# log10 x-axis. The facet strips label the panels, so the legend is hidden.
df %>%
  ggplot(aes(x = likes, color = category_name, fill = category_name)) +
  geom_histogram(alpha = 0.6, bins = 30) +
  scale_x_continuous(trans = 'log10') +
  facet_wrap(~category_name) +
  xlab("") +
  ylab("") +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.spacing = unit(0.1, "lines"),
    strip.text.x = element_text(size = 8)
  )

# Mean likes per category, bucketed by trending month (stacked columns).
# Fix: the title said "Number of Likes" although summarize() computes the
# MEAN of likes, matching the equivalent views chart.
df %>%
  mutate(mon = floor_date(trending_date, 'month')) %>%
  group_by(mon, category_name) %>%
  summarize(total = mean(likes), .groups = 'keep') %>%

  ggplot(aes(x=mon, y=total, fill=category_name)) +
  geom_col(color = "black") + 
  scale_x_date(date_breaks = "1 month", expand = c(0,0), date_labels = "%b-%y") +
  theme_classic() +
  theme(
  plot.title = element_text(face = "bold")) +
  labs(
  x = "Trending Date", y = "Likes",
  title = "Mean Number of Likes per Category",
  subtitle = "Grouped by Month for Trending Date"
  )

Dislikes

# Univariate distribution of dislikes on a log10 x-axis.
# Fixes: the y-axis label said "Count (Log 10)" although the log10
# transformation is on the x-axis, and the subtitle was missing "on".
df %>%
  ggplot(aes(x = dislikes)) +
  geom_histogram(color = "gray", fill = "lightgray", bins = 30) + 
  scale_x_continuous(trans='log10') +
  theme_minimal() +
  theme(
  plot.title = element_text(face = "bold")) +
  labs(
  x = NULL, y = "Count",
  title = "Distribution of Dislikes",
  subtitle = "Using a log10 transformation on the x-axis"
  )

# Mean dislikes per category, bucketed by trending month (stacked columns).
# Fix: the title said "Number of Dislikes" although summarize() computes the
# MEAN of dislikes, matching the equivalent views chart.
df %>%
  mutate(mon = floor_date(trending_date, 'month')) %>%
  group_by(mon, category_name) %>%
  summarize(total = mean(dislikes), .groups = 'keep') %>%

  ggplot(aes(x=mon, y=total, fill=category_name)) +
  geom_col(color = "black") + 
  scale_x_date(date_breaks = "1 month", expand = c(0,0), date_labels = "%b-%y") +
  theme_classic() +
  theme(
  plot.title = element_text(face = "bold")) +
  labs(
  x = "Trending Date", y = "Dislikes",
  title = "Mean Number of Dislikes per Category",
  subtitle = "Grouped by Month for Trending Date"
  )

# List the most-disliked "People & Blogs" videos trending in January 2018.
# Fixes two bugs in the original filter:
#   * dates were written "01-01-2018"/"02-01-2018", which Date comparison
#     cannot parse (it expects ISO "YYYY-MM-DD");
#   * the second comparison used `>` where a bounded date window needs `<=`,
#     so the two conditions contradicted the stated intent.
df %>%
  filter(trending_date >= as.Date("2018-01-01") &
           trending_date <= as.Date("2018-02-01") &
           category_name == "People & Blogs") %>%
  arrange(desc(dislikes))

Top 5 Videos

  • Identify the top 5 videos based on the number of views and number of likes

Top 5 Videos based on Likes

The following are the top five videos based on Likes. We can see that the top 5 videos are all the same, but with different trending dates.

# Top five rows ranked by likes. A video may appear several times here, once
# per trending day it was captured.
df %>%
  select(trending_date, title, likes) %>%
  arrange(desc(likes)) %>%
  slice_head(n = 5)

The following are the top five unique videos based on likes.

# Top five DISTINCT videos by likes: sorting first means distinct() keeps
# each video's highest-liked trending-day row.
df %>%
  arrange(desc(likes)) %>%
  distinct(video_id, .keep_all = TRUE) %>%
  select(trending_date, title, likes) %>%
  slice_head(n = 5)

Top 5 Videos based on Views

The following are the top five unique videos based on views.

# Top five DISTINCT videos by views: sorting first means distinct() keeps
# each video's highest-viewed trending-day row.
df %>%
  arrange(desc(views)) %>%
  distinct(video_id, .keep_all = TRUE) %>%
  select(trending_date, title, views) %>%
  slice_head(n = 5)

Correlation to Likes

  • What numeric values correlate to likes?
# Keep only the numeric columns for the correlation analysis.
# Fixes: `<-` instead of `=` for assignment (house style elsewhere in this
# script), and select(where(is.numeric)) instead of the superseded
# select_if().
df_corr <- df %>%
  select(where(is.numeric)) %>%
  # reordering the numeric columns so likes is listed first.
  select(likes, views, dislikes, comment_count)

Correlation Test

In order to find out which numeric values correlate to likes, we can create a correlation matrix and correlation plot.

# Pairwise Pearson correlation matrix of the numeric columns, rounded to two
# decimals.
corr <- round(cor(df_corr), 2)
corr
##               likes views dislikes comment_count
## likes          1.00  0.83     0.46          0.84
## views          0.83  1.00     0.56          0.69
## dislikes       0.46  0.56     1.00          0.64
## comment_count  0.84  0.69     0.64          1.00

We can see that all of the other numeric values positively correlate to likes, with comment_count being the strongest at 0.84, views the next strongest at 0.83, and dislikes the weakest at 0.46.

Next we can see a visualization of these values.

ggcorrplot(corr, colors = c("#6D9EC1", "white", "#E46726"),lab = TRUE,  ggtheme = ggplot2::theme_gray)

### Scatterplot

# Scatter of likes vs. comment_count — the strongest correlate of likes
# (r = 0.84) — with both axes on a log10 scale and a dashed linear fit.
# Fix: the axis labels and title said "Views", although the plot maps likes
# to x and comment_count to y (the commentary below the plot confirms the
# intended comparison is likes vs. comment_count).
df %>%
  ggplot(aes(x = likes, y = comment_count)) + 
  geom_point(alpha = 0.1, color = "#E46726") + 
  theme_minimal() +
  scale_x_continuous(trans='log10') +
  scale_y_continuous(trans='log10') +
  geom_smooth(method = "lm", se = TRUE, formula = y ~ x, color = "black", linetype = 'dashed') + 
  labs(x = "Likes (Log 10 Scale)",
       y = "Comment Count (Log 10 Scale)",
       title = "Correlation of Likes vs. Comment Count")

After transforming the data on both axes with a log 10 scale, we can see the very linear, positive relationship between the dependent variable Likes, and our strongest correlated independent variable, comment_count.

Next we’ll create a simple linear regression model and a multiple linear regression model and test which performs better.

# Simple linear regression: log1p(likes) on log1p(comment_count), the
# strongest single correlate. log1p() handles the zero counts and heavy
# right skew in both variables.
fit1 <- lm(formula = log1p(likes) ~ log1p(comment_count), data = df)

summary(fit1)
## 
## Call:
## lm(formula = log1p(likes) ~ log1p(comment_count), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3344  -0.6043   0.0959   0.6852  10.1327 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.875172   0.023547   122.1   <2e-16 ***
## log1p(comment_count) 0.858186   0.003238   265.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.271 on 40879 degrees of freedom
## Multiple R-squared:  0.6322, Adjusted R-squared:  0.6322 
## F-statistic: 7.026e+04 on 1 and 40879 DF,  p-value: < 2.2e-16
# Multiple regression adding log1p(views) and log1p(dislikes) as predictors
# alongside log1p(comment_count).
fit2 <- lm(formula = log1p(likes) ~ log1p(views) + log1p(comment_count) + log1p(dislikes), data = df)

summary(fit2)
## 
## Call:
## lm(formula = log1p(likes) ~ log1p(views) + log1p(comment_count) + 
##     log1p(dislikes), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2157 -0.5824  0.1084  0.6662  5.0774 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.235145   0.060428  -3.891 9.99e-05 ***
## log1p(views)          0.312152   0.006780  46.040  < 2e-16 ***
## log1p(comment_count)  0.415454   0.003919 106.017  < 2e-16 ***
## log1p(dislikes)       0.387105   0.005751  67.316  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.019 on 40877 degrees of freedom
## Multiple R-squared:  0.7637, Adjusted R-squared:  0.7637 
## F-statistic: 4.403e+04 on 3 and 40877 DF,  p-value: < 2.2e-16
# Nested-model F test: does the multiple-regression model (fit2)
# significantly improve on the simple model (fit1)?
anova(fit1, fit2)

Summary: Both models are significant with very small P-values, and each independent variable in the multiple-regression model is also significant. When we perform an Anova test for significance, we can see that the Multiple Linear model (Fit2) does still hold up and therefore is the better model increasing the Adjusted R-squared from 0.6322 to 0.7637.

Therefore, with Model 2, we can say that for the dependent variable Likes, the predictors Views, Comment_Count, and Dislikes together explain 76.37% of its variance.

Text Analysis

Titles

# Build a tm corpus where each video title is one document.
title_corpus <- Corpus(VectorSource(df$title))
title_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 40881
# Show the first four raw titles before any cleaning.
inspect(title_corpus[1:4])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 4
## 
## [1] Eminem - Walk On Water (Audio) ft. Beyoncé           
## [2] PLUSH - Bad Unboxing Fan Mail                        
## [3] Racist Superman | Rudy Mancuso, King Bach & Lele Pons
## [4] I Dare You: GOING BALD!?

Preprocessing

Using the tm package, perform transformations on the corpus to clean the text. There are generalized text cleaning activities such as normalization and stop word removal.

# Standard cleansing pipeline for the title corpus: lowercase, strip
# punctuation/numbers/whitespace, drop English stop words, then stem.
# NOTE(review): the "transformation drops documents" warnings below are a
# known SimpleCorpus quirk — the inspect() output shows the documents are in
# fact retained; wrapping base functions in content_transformer() is the
# documented way to silence them — confirm before changing.
title_corpus <- tm_map(title_corpus, tolower)            # normalize case
## Warning in tm_map.SimpleCorpus(title_corpus, tolower): transformation drops
## documents
title_corpus <- tm_map(title_corpus, removePunctuation)  # remove punctuation
## Warning in tm_map.SimpleCorpus(title_corpus, removePunctuation): transformation
## drops documents
title_corpus <- tm_map(title_corpus, removeNumbers)      # remove numbers
## Warning in tm_map.SimpleCorpus(title_corpus, removeNumbers): transformation
## drops documents
title_corpus <- tm_map(title_corpus, stripWhitespace)    # remove white space
## Warning in tm_map.SimpleCorpus(title_corpus, stripWhitespace): transformation
## drops documents
title_corpus <- tm_map(title_corpus, removeWords, stopwords("english")) # remove stopwords
## Warning in tm_map.SimpleCorpus(title_corpus, removeWords, stopwords("english")):
## transformation drops documents
# Show the first four titles after cleaning.
inspect(title_corpus[1:4])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 4
## 
## [1] eminem walk  water audio ft beyoncé             
## [2] plush bad unboxing fan mail                     
## [3] racist superman rudy mancuso king bach lele pons
## [4]  dare  going bald
# Reduce inflected words to their stems using the SnowBall (Porter) stemmer,
# e.g. "official" -> "offici", "episode" -> "episod".
title_corpus <- tm_map(title_corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(title_corpus, stemDocument): transformation drops
## documents

Document-Term Matrix

Create a Term-Document Matrix from the cleaned Corpus

# The term document matrix is where each word/term is a row with documents as
# columns; each cell holds that term's frequency in that document.
dtm <- TermDocumentMatrix(title_corpus)

# inspect
inspect(dtm)
## <<TermDocumentMatrix (terms: 24539, documents: 40881)>>
## Non-/sparse entries: 250374/1002928485
## Sparsity           : 100%
## Maximal term length: 81
## Weighting          : term frequency (tf)
## Sample             :
##          Docs
## Terms     16041 16336 16821 17065 26967 27278 27487 28468 38427 39186
##   episod      0     0     0     0     0     0     0     0     0     0
##   full        1     1     1     1     1     1     0     0     1     1
##   game        0     0     0     0     0     0     0     0     0     0
##   music       0     0     0     0     0     0     0     0     0     0
##   new         0     0     0     0     0     0     0     0     0     0
##   offici      0     0     0     0     0     0     0     0     0     0
##   song        0     0     0     0     0     0     0     0     0     0
##   trailer     0     0     0     0     0     0     0     0     0     0
##   trump       0     0     0     0     0     0     0     0     0     0
##   video       0     0     0     0     0     0     0     0     0     0
# Drop terms absent from more than 99% of documents to keep the matrix small.
# Fix: use `<-` for assignment instead of `=`, matching the rest of the script.
dtm1 <- removeSparseTerms(dtm, 0.99)

Perform Analysis

Frequent Terms

  • Use findFreqTerms(): find frequent terms in a document-term or term-document matrix.
  • Find terms that occur at least 5 times and show the first 50.
# Terms occurring at least 5 times in the sparsity-trimmed matrix; head(50)
# shows the first 50 in matrix order (not ranked by frequency).
findFreqTerms(dtm1, 5) %>%
  head(50)
##  [1] "music"     "offici"    "video"     "paul"      "new"       "final"    
##  [7] "food"      "game"      "full"      "latest"    "punjabi"   "song"     
## [13] "react"     "time"      "top"       "youtub"    "show"      "test"     
## [19] "real"      "feat"      "tri"       "live"      "trump"     "season"   
## [25] "break"     "best"      "trailer"   "episod"    "get"       "make"     
## [31] "highlight" "part"      "life"      "day"       "first"     "movi"     
## [37] "drama"     "challeng"  "nba"       "news"      "الحلقة"    "war"
# Bar chart of the most frequent title terms.
# Fixes relative to the original:
#   * arrange(desc(freq)) before head() so the chart shows the 35 MOST
#     frequent terms (term-matrix row order is not frequency order);
#   * scale_fill_gradientn() matches the fill aesthetic — the previous
#     scale_colour_gradientn() had no effect on the bars;
#   * geom_col() replaces geom_bar(stat = "identity");
#   * "Occuring" -> "Occurring" in the title.
termCount <- rowSums(as.matrix(dtm1))      # total occurrences of each term
termCount <- subset(termCount, termCount >= 20)  # drop rare terms

df2 <- data.frame(term = names(termCount), freq = termCount)
df2 %>%
  arrange(desc(freq)) %>%
  head(35) %>%
  ggplot(aes(x = reorder(term, freq), y = freq, fill = freq)) +
    geom_col() +
    scale_fill_gradientn(colors = terrain.colors(10)) +
    theme_classic() +
    coord_flip() +
    theme(
    plot.title = element_text(face = "bold")) +
    labs(
    x = NULL, y = "Count",
    title = "Most Frequently Occurring Words in Titles"
    )

Descriptions

# Build a tm corpus where each video description is one document.
description_corpus <- Corpus(VectorSource(df$description))
description_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 40881
# Show the first two raw descriptions — note the embedded URLs, literal "\n"
# escapes, and arrow symbols that the preprocessing below removes.
inspect(description_corpus[1:2])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 2
## 
## [1] Eminem's new track Walk on Water ft. Beyoncé is available everywhere: http://shady.sr/WOWEminem \\nPlaylist Best of Eminem: https://goo.gl/AquNpo\\nSubscribe for more: https://goo.gl/DxCrDV\\n\\nFor more visit: \\nhttp://eminem.com\\nhttp://facebook.com/eminem\\nhttp://twitter.com/eminem\\nhttp://instagram.com/eminem\\nhttp://eminem.tumblr.com\\nhttp://shadyrecords.com\\nhttp://facebook.com/shadyrecords\\nhttp://twitter.com/shadyrecords\\nhttp://instagram.com/shadyrecords\\nhttp://trustshady.tumblr.com\\n\\nMusic video by Eminem performing Walk On Water. (C) 2017 Aftermath Records\\nhttp://vevo.ly/gA7xKt                                                                                                             
## [2] STill got a lot of packages. Probably will last for another year. On a side note, more 2nd channel vids soon. editing with premiere from now on, gon' be a tedious transition, but i think it's for the best. \\n\\n__\\n\\nSUBSCRIBE ► http://www.youtube.com/subscription_center?add_user=iDubbbztv\\n\\nMain Channel ► https://www.youtube.com/user/iDubbbzTV\\nSecond Channel ► https://www.youtube.com/channel/UC-tsNNJ3yIW98MtPH6PWFAQ\\nGaming Channel ► https://www.youtube.com/channel/UCVhfFXNY0z3-mbrTh1OYRXA\\n\\nWebsite ► http://www.idubbbz.com/\\n\\nInstagram ► https://instagram.com/idubbbz/\\nTwitter ► https://twitter.com/Idubbbz\\nFacebook ► http://www.facebook.com/IDubbbz\\nTwitch ► http://www.twitch.tv/idubbbz\\n_

Preprocessing

Using the tm package, perform transformations on the corpus to clean the text. There are generalized text cleaning activities such as normalization and stop word removal.

# Remove URLs (everything from "http" up to the next whitespace)
description_corpus <- tm_map(description_corpus, 
                             content_transformer(function(x) gsub("http[[:alnum:][:punct:]]*", "", x)))
## Warning in tm_map.SimpleCorpus(description_corpus,
## content_transformer(function(x) gsub("http[[:alnum:][:punct:]]*", :
## transformation drops documents
# Remove literal "\n" escape sequences left over from the CSV export
# (the replacement is "" — they are deleted, not replaced with a space)
description_corpus <- tm_map(description_corpus, 
                             content_transformer(function(x) gsub("\\\\n", "", x)))
## Warning in tm_map.SimpleCorpus(description_corpus,
## content_transformer(function(x) gsub("\\\\n", : transformation drops documents
# Remove the odd "arrow" symbol
description_corpus <- tm_map(description_corpus, 
                             content_transformer(function(x) gsub("►", "", x)))
## Warning in tm_map.SimpleCorpus(description_corpus,
## content_transformer(function(x) gsub("►", : transformation drops documents
# Remove the other "arrow" symbol variant
description_corpus <- tm_map(description_corpus, 
                             content_transformer(function(x) gsub("▶", "", x)))
## Warning in tm_map.SimpleCorpus(description_corpus,
## content_transformer(function(x) gsub("▶", : transformation drops documents
# Show the first four descriptions after URL/symbol removal.
inspect(description_corpus[1:4])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 4
## 
## [1] Eminem's new track Walk on Water ft. Beyoncé is available everywhere:  Playlist Best of Eminem:  for more:  more visit:  video by Eminem performing Walk On Water. (C) 2017 Aftermath Records                                                                                                                                                                              
## [2] STill got a lot of packages. Probably will last for another year. On a side note, more 2nd channel vids soon. editing with premiere from now on, gon' be a tedious transition, but i think it's for the best. __SUBSCRIBE   Channel   Channel   Channel                                                                                                                    
## [3] WATCH MY PREVIOUS VIDEO  SUBSCRIBE   FOR WATCHING! LIKE & SUBSCRIBE FOR MORE VIDEOS!-----------------------------------------------------------FIND ME ON: Instagram |  |  |  Rudy Mancuso |  Pons |  Bach |  Effects: Caleb Natale |  GregoryShots Studios Channels:Alesso |  |  Jibawi |  Puppets |  Stocking |  Sarkis |  Pons |  |  Tyson |  Rudy Mancuso |  Studios | 
## [4] I know it's been a while since we did this show, but we're back with what might be the best episode yet!Leave your dares in the comment section! Order my book how to write good  Launched New Official Store Channel us mail or whatever you want here!PO Box 232355Las Vegas, NV 89105
# Standard cleansing pipeline for the description corpus, mirroring the one
# applied to titles: lowercase, strip punctuation/numbers/whitespace, drop
# English stop words, then stem.
# NOTE(review): the "transformation drops documents" warnings are a known
# SimpleCorpus quirk; the inspect() output shows the documents are retained.
description_corpus <- tm_map(description_corpus, tolower)            # normalize case
## Warning in tm_map.SimpleCorpus(description_corpus, tolower): transformation
## drops documents
description_corpus <- tm_map(description_corpus, removePunctuation)  # remove punctuation
## Warning in tm_map.SimpleCorpus(description_corpus, removePunctuation):
## transformation drops documents
description_corpus <- tm_map(description_corpus, removeNumbers)      # remove numbers
## Warning in tm_map.SimpleCorpus(description_corpus, removeNumbers):
## transformation drops documents
description_corpus <- tm_map(description_corpus, stripWhitespace)    # remove white space
## Warning in tm_map.SimpleCorpus(description_corpus, stripWhitespace):
## transformation drops documents
description_corpus <- tm_map(description_corpus, removeWords, stopwords("english")) # remove stopwords
## Warning in tm_map.SimpleCorpus(description_corpus, removeWords,
## stopwords("english")): transformation drops documents
# Show the first four descriptions after cleansing.
inspect(description_corpus[1:4])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 4
## 
## [1] eminems new track walk  water ft beyoncé  available everywhere playlist best  eminem    visit video  eminem performing walk  water c aftermath records                                                                           
## [2] still got  lot  packages probably will last  another year   side note  nd channel vids soon editing  premiere  now  gon   tedious transition   think    best subscribe channel channel channel                                   
## [3] watch  previous video subscribe  watching like subscribe   videosfind   instagram rudy mancuso pons bach effects caleb natale gregoryshots studios channelsalesso jibawi puppets stocking sarkis pons tyson rudy mancuso studios 
## [4]  know     since    show   back   might   best episode yetleave  dares   comment section order  book   write good launched new official store channel us mail  whatever  want herepo box las vegas nv
# Use the Snowball Stemmer on the Corpus to reduce words to their stems.
description_corpus <- tm_map(description_corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(description_corpus, stemDocument): transformation
## drops documents

Document-Term Matrix

Create a Term-Document Matrix from the cleaned Corpus

# The term document matrix is where each word/term is a row with documents as
# columns; each cell holds that term's frequency in that description.
description_dtm <- TermDocumentMatrix(description_corpus)

# inspect
inspect(description_dtm)
## <<TermDocumentMatrix (terms: 133227, documents: 40881)>>
## Non-/sparse entries: 2087309/5444365678
## Sparsity           : 100%
## Maximal term length: 1561
## Weighting          : term frequency (tf)
## Sample             :
##           Docs
## Terms      11818 19426 19742 21704 35998 38860 39007 40365 8091 8391
##   channel      0     0     0     2     0     0     0     0    0    0
##   facebook     0     0     0     0     0     0     0     0    0    0
##   get          0     0     0     0     0     0     0     0    0    0
##   music        0     0     0     0     4     3     3     0    0    0
##   new          0     0     0     0     2     3     3     0    0    0
##   show         0     0     0     0     3     3     3     0    0    0
##   subscrib     0     0     0     1     2     2     2     0    0    0
##   twitter      0     0     0     0     0     0     0     0    0    0
##   video        0     0     0     0     6     6     6     0    0    0
##   watch        1     0     0     0     0     0     0     0    0    0
# Drop terms absent from more than 99% of documents to keep the matrix small.
# Fix: use `<-` for assignment instead of `=`, matching the rest of the script.
description_dtm1 <- removeSparseTerms(description_dtm, 0.99)

Frequent Terms

  • Use findFreqTerms(): find frequent terms in a document-term or term-document matrix.
  • Find terms that occur at least 5 times and show the first 50.
# Terms occurring at least 5 times in the trimmed description matrix;
# head(50) shows the first 50 in matrix order (not ranked by frequency).
findFreqTerms(description_dtm1, 5) %>%
  head(50)
##  [1] "avail"     "best"      "new"       "perform"   "playlist"  "record"   
##  [7] "track"     "video"     "visit"     "walk"      "water"     "anoth"    
## [13] "channel"   "edit"      "got"       "last"      "lot"       "note"     
## [19] "now"       "premier"   "side"      "soon"      "still"     "subscrib" 
## [25] "think"     "will"      "year"      "effect"    "instagram" "like"     
## [31] "previous"  "studio"    "watch"     "back"      "book"      "box"      
## [37] "comment"   "episod"    "good"      "know"      "mail"      "might"    
## [43] "offici"    "order"     "section"   "show"      "sinc"      "store"    
## [49] "want"      "addit"
# Bar chart of the most frequent description terms.
# Fixes relative to the original:
#   * the title said "Titles" although this chart is built from DESCRIPTIONS;
#   * arrange(desc(freq)) before head() so the chart shows the 35 MOST
#     frequent terms (term-matrix row order is not frequency order);
#   * scale_fill_gradientn() matches the fill aesthetic — the previous
#     scale_colour_gradientn() had no effect on the bars;
#   * geom_col() replaces geom_bar(stat = "identity");
#   * "Occuring" -> "Occurring" in the title.
termCount <- rowSums(as.matrix(description_dtm1))  # total occurrences per term
termCount <- subset(termCount, termCount >= 20)    # drop rare terms

description_df <- data.frame(term = names(termCount), freq = termCount)
description_df %>%
  arrange(desc(freq)) %>%
  head(35) %>%
  ggplot(aes(x = reorder(term, freq), y = freq, fill = freq)) +
    geom_col() +
    scale_fill_gradientn(colors = terrain.colors(10)) +
    theme_classic() +
    coord_flip() +
    theme(
    plot.title = element_text(face = "bold")) +
    labs(
    x = NULL, y = "Count",
    title = "Most Frequently Occurring Words in Descriptions"
    )